#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
import time
import json
from pprint import pprint

from calibre.web.feeds.news import BasicNewsRecipe


class OrangeCountyRegister(BasicNewsRecipe):
    """Calibre recipe that scrapes section index pages of ocregister.com.

    The section pages listed in ``feeds`` are HTML pages rather than RSS
    feeds, so ``parse_index`` is overridden to build the article list by
    scraping ``a.article-title`` links from each page.
    """

    title = u'Orange County Register'
    __author__ = 'TechnoCat'
    description = 'The O.C. Register\nRecipe: Nov 2016'
    # NOTE(review): this points at a realclearpolitics.com logo, which looks
    # like a copy/paste leftover -- confirm and replace with an OC Register
    # cover image.
    cover_url = 'http://www.realclearpolitics.com/dev/mt-static/images/logo.gif'
    # Evaluated once, when the class body is executed.
    custom_title = 'OC Register - ' + time.strftime('%d %b %Y')
    auto_cleanup = True
    # Hides the author marker div injected by extract_readable_article().
    extra_css = 'div.metaAuthor { display:none;}\n'
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    remove_tags = [dict(name='img', attrs={})]
    # Don't go down
    recursions = 0
    max_articles_per_feed = 400
    # Set to True to get verbose tracing on stdout while the recipe runs.
    debugMessages = False

    # (section title, section index page URL)
    feeds = [('News', 'https://www.ocregister.com/news/'),
             ('Opinion', 'https://www.ocregister.com/opinion/'),
             ('Politics', 'https://www.ocregister.com/news/politics/'),
             ('Business', 'https://www.ocregister.com/business/')]

    def _debug(self, *args):
        """Print ``args`` only when ``debugMessages`` is enabled."""
        if self.debugMessages:
            print(*args)

    def parsePage(self, index):
        """Scrape one section index page into a list of article dicts.

        :param index: position of the section in ``self.feeds``.
        :return: list of dicts with the keys ``title``, ``url``, ``date``,
                 ``description`` and ``content``. Anchors that carry no
                 usable title or no ``href`` are skipped.
        """
        self._debug("\n\nStarting " + self.feeds[index][0])
        articleList = []
        soup = self.index_to_soup(self.feeds[index][1])
        # The index page does not give us a per-article date here, so every
        # entry is stamped with the download date.
        pubdate = time.strftime('%a, %d %b')
        # Articles are the a.article-title links on the index page.
        for newsentry in soup.findAll("a", {"class": "article-title"}):
            self._debug('Next up:')
            self._debug(newsentry)
            # Use .get() so an anchor missing an attribute cannot abort the
            # whole download with a KeyError; fall back to the link text
            # when there is no title attribute.
            title = newsentry.get('title') or self.tag_to_string(newsentry).strip()
            url = newsentry.get('href')
            if not title or not url:
                self._debug('Skipping entry without title or href')
                continue
            self._debug("Title: ")
            self._debug(title)
            self._debug('URL')
            self._debug(url)
            articleList.append(
                dict(
                    title=title,
                    url=url,
                    date=pubdate,
                    description=title,
                    content=''
                )
            )
        return articleList

    def extract_readable_article(self, html, url):
        """Run the base-class cleanup, then tag the result with the byline.

        The byline is read from the page's ``dataLayer.push({...})`` metadata
        (key ``Byline``) and, when present, inserted right after ``<body>``
        as ``<div class="metaAuthor" value="...">``.

        :param html: raw article HTML as text.
        :param url: URL the article was fetched from.
        :return: the cleaned HTML, with the author marker added when a
                 byline was found.
        """
        # Imported locally so the module's import block stays untouched.
        from xml.sax.saxutils import escape

        cleanedHTML = super(OrangeCountyRegister,
                            self).extract_readable_article(html, url)
        self._debug("Processing html for author")
        # htmlToAttribsDict() returns None when the page has no usable
        # metadata; treat that the same as "no byline" instead of crashing.
        attribDict = self.htmlToAttribsDict(html) or {}
        author = attribDict.get('Byline')
        if author is not None:
            self._debug("Adding author in meta")
            self._debug(author)
            # Coerce to text and escape, so a byline containing quotes,
            # '<' or '&' cannot break the attribute it is embedded in.
            safeAuthor = escape('%s' % (author,), {'"': '&quot;'})
            cleanedHTML = cleanedHTML.replace(
                "<body>",
                "<body>\n<div class=\"metaAuthor\" value=\"" + safeAuthor + "\"></div>\n"
            )
        else:
            self._debug('no author found')
            self._debug(html)
        # pubDate = attribDict.get('Publish Hour of Day')
        return cleanedHTML

    def loadURL(self, url):
        """Fetch ``url`` and return the raw (unparsed) page content."""
        return self.index_to_soup(url, raw=True)

    def htmlToAttribsDict(self, rawHTML):
        """Extract the first ``dataLayer.push({...});`` object from a page.

        :param rawHTML: page HTML as text.
        :return: the decoded metadata dict, or ``None`` when the marker is
                 missing, unterminated, or its payload is not valid JSON.
        """
        tokenStart = 'dataLayer.push({'
        tokenEnd = '});'
        startJSON = rawHTML.find(tokenStart)
        if startJSON < 0:
            return None
        # "- 1" keeps the opening brace that is part of tokenStart.
        JSONBeginning = rawHTML[startJSON + len(tokenStart) - 1:]
        endJSON = JSONBeginning.find(tokenEnd)
        if endJSON < 0:
            return None
        # "+ 1" keeps the closing brace that is part of tokenEnd.
        JSON = JSONBeginning[:endJSON + 1]
        # The payload is a JavaScript literal using single quotes; swap them
        # for double quotes so json can read it. This is a blunt conversion:
        # a value containing an apostrophe makes the payload invalid, which
        # is reported below and results in None.
        JSONQuoted = JSON.replace("'", "\"")
        try:
            metadata = json.loads(JSONQuoted)
        except ValueError:
            self._debug("Could not decode JSON:")
            self._debug(JSONQuoted)
            return None
        if self.debugMessages:
            pprint(metadata)
        return metadata

    def parse_index(self):
        """Build the article index for every section in ``feeds``.

        Used instead of ``BasicNewsRecipe.parse_feeds()``.

        :return: a list of ``('feed title', list of articles)`` tuples,
                 where each article is a dict with the keys

                 - ``title``: article title
                 - ``url``: URL of the article
                 - ``date``: the publication date as a string
                 - ``description``: a summary of the article
                 - ``content``: the full article (can be an empty string)
        """
        ans = []
        # Walk every configured feed, including the last one.
        for index, feed in enumerate(self.feeds):
            feedarticles = self.parsePage(index)
            if feedarticles is not None:
                ans.append((feed[0], feedarticles))
        self._debug(ans)
        return ans
